In [44]:
import pyspark.sql.functions as sql
import pyspark.sql.types as types
In [6]:
idb_df_version = "20170130"
idb_df = sqlContext.read.parquet("/guoda/data/idigbio-{0}.parquet".format(idb_df_version))
idb_df.count()
Out[6]:
To test our code, find one collection that seems to have numeric ids. Go to the search API and find the most common catalog number:
http://search.idigbio.org/v2/summary/top/records?top_fields=catalognumber&count=100
(It is possible that some collections use "good" catalog numbers that are UUIDs or other forms of GUIDs. This work can never apply to them.)
And then look for collection codes that use something popular like "100":
Looks like the recordset a6eee223-cf3b-4079-8bb2-b77dad8cae9d has 4 records with this number and 6M records so it sounds interesting but let's start with 271a9ce9-c6d3-4b63-a722-cb0adc48863f since it has 1.8M records and will be a bit easier to work with.
In [16]:
subset = (idb_df
.select(idb_df.catalognumber)
.where(idb_df.recordset == "271a9ce9-c6d3-4b63-a722-cb0adc48863f")
)
subset.cache()
subset.count()
Out[16]:
In [17]:
print(subset.where(subset.catalognumber == "0").count())
In [45]:
def to_int(s):
try:
return int(s)
except:
# 0 is a terrible flag value but it is graphable so we can see
# how bad things are
return 0
to_int_udf = sql.udf(to_int, types.IntegerType())
In [46]:
catalognumbers = (subset
.withColumn("number", to_int_udf(subset.catalognumber))
)
In [47]:
print(catalognumbers.where(catalognumbers.number == 0).count())
In [48]:
catalognumbers.where(catalognumbers.number == 0).head(10)
Out[48]:
In [49]:
catalognumbers.where("catalognumber='r-70049' OR catalognumber='70049'").head(10)
Out[49]:
Well there certainly is and there are repeated catalog numbers too. We certainly can't assume that 'r-70049' is the same as '70049' and if we could we certainly can't guess at the catalog number practices across collections.
In [50]:
catalognumbers_pd = (catalognumbers
.select(catalognumbers.number)
.sort(catalognumbers.number)
.toPandas()
)
In [51]:
catalognumbers_pd.describe()
Out[51]:
In [53]:
catalognumbers_pd[-10:]
Out[53]:
In [65]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(catalognumbers_pd.index.values[:-3],
catalognumbers_pd[:-3]["number"])
plt.axis([0, 2000000, 0, 2000000])
plt.xlabel("Rank of specimen record")
plt.ylabel("Numeric catalog number")
plt.title("Catalog numbers from Museum of Comparative Zoology,\nHarvard University (271a9ce9-c6d3-4b63-a722-cb0adc48863f)")
Out[65]:
Let's zoom in on some interesting parts.
In [75]:
x_start = 1418000
x_end = 1419000
y_start = 199000
y_end = 201000
plt.plot(catalognumbers_pd.index.values[x_start:x_end],
catalognumbers_pd[x_start:x_end]["number"])
plt.axis([x_start, x_end, y_start, y_end])
Out[75]:
In [83]:
x_start = 1850000
x_end = 1875000
y_start = 600000
y_end = 700000
plt.plot(catalognumbers_pd.index.values[x_start:x_end],
catalognumbers_pd[x_start:x_end]["number"])
plt.axis([x_start, x_end, y_start, y_end])
Out[83]: